import pandas as pd
import numpy as np
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col, split, explode, regexp_replace, transform, when, monotonically_increasing_id
)

np.random.seed(42)
pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("./data/lightcast_job_postings.csv")
)
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
# print("---This is Diagnostic check, No need to print it in the final doc---")
# df.printSchema()  # comment this line when rendering the submission
# df.show(5)
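As a quick sanity check after loading, a snippet like the one below counts the rows and previews a few of the salary-related columns. This is a minimal sketch, assuming the Lightcast CSV exposes the SALARY, SALARY_FROM, and SALARY_TO columns referenced in the cleaning step below; it is a diagnostic and not part of the pipeline itself.

# Sanity-check sketch (assumes SALARY, SALARY_FROM, SALARY_TO exist in the loaded CSV)
print(f"Loaded rows: {df.count()}, columns: {len(df.columns)}")
df.select("SALARY", "SALARY_FROM", "SALARY_TO").show(5, truncate=False)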
2 Data Cleaning
from pyspark.sql.functions import col, when, regexp_replace

# Cast salary and experience columns to numeric types
df = df.withColumn("SALARY", col("SALARY").cast("float"))
df = df.withColumn("SALARY_FROM", col("SALARY_FROM").cast("float"))
df = df.withColumn("SALARY_TO", col("SALARY_TO").cast("float"))
df = df.withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float"))
df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))

# Compute median_from, median_to, median_salary
median_from = df.approxQuantile("SALARY_FROM", [0.5], 0.01)[0]
median_to = df.approxQuantile("SALARY_TO", [0.5], 0.01)[0]
median_salary = df.approxQuantile("SALARY", [0.5], 0.01)[0]
print(median_from, median_to, median_salary)

# Impute missing 'SALARY_FROM' and 'SALARY_TO' with their medians
df = df.fillna({"SALARY_FROM": median_from, "SALARY_TO": median_to})

# Compute 'AVERAGE_SALARY'
df = df.withColumn("AVERAGE_SALARY", (col("SALARY_FROM") + col("SALARY_TO")) / 2)

# Impute missing 'SALARY' with AVERAGE_SALARY, and if that is also missing, with the median salary
df = df.withColumn(
    "SALARY",
    when(
        col("SALARY").isNull(),
        when(col("AVERAGE_SALARY").isNotNull(), col("AVERAGE_SALARY")).otherwise(median_salary),
    ).otherwise(col("SALARY")),
)

# Remove embedded newlines from 'EDUCATION_LEVELS_NAME'
df = df.withColumn("EDUCATION_LEVELS_NAME", regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\n\r]", ""))

# Overwrite the cleaned dataset
df.write.option("header", True).mode("overwrite").csv("data/lightcast_job_postings_cleaned.csv")

# Display row count
print(f"Rows retained after cleaning: {df.count()}")
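To confirm the imputation behaved as intended, a short check like the sketch below counts any remaining nulls in the salary columns; this uses standard PySpark aggregation and is an optional verification step, not part of the original cleaning cell.

from pyspark.sql import functions as F

# Count remaining nulls in the imputed salary columns (expected to be zero after cleaning)
df.select(
    *[F.sum(F.col(c).isNull().cast("int")).alias(f"{c}_nulls")
      for c in ["SALARY", "SALARY_FROM", "SALARY_TO", "AVERAGE_SALARY"]]
).show()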
Salaries vary widely between industries, with sectors like Information and Finance & Insurance generally showing higher salary ranges than industries such as Accommodation and Food Services. Full-time positions tend to have higher median salaries across most industries compared to part-time or other employment types.
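A grouped summary along the lines of the sketch below would support this comparison. The column names NAICS2_NAME and EMPLOYMENT_TYPE_NAME are assumptions about the Lightcast schema and may need to be adjusted to the dataset's actual industry and employment-type fields.

from pyspark.sql import functions as F

# Median salary by industry and employment type
# (NAICS2_NAME and EMPLOYMENT_TYPE_NAME are assumed column names; adjust to the actual schema)
salary_by_industry = (
    df.groupBy("NAICS2_NAME", "EMPLOYMENT_TYPE_NAME")
      .agg(F.percentile_approx("SALARY", 0.5).alias("median_salary"),
           F.count("*").alias("postings"))
      .orderBy(F.col("median_salary").desc())
)
salary_by_industry.show(10, truncate=False)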